There are three datasets covering the stations, trips, and weather from 2014-2016:
Station dataset: one record per station, including its id and coordinates
Trip dataset: one record per individual trip
Weather dataset: daily weather information in the service area
The trip file had the header row repeated after the values of one line; I removed it by deleting those header values from the file. I also noticed that the first several rows were duplicated, and the line carrying the stray header was one of them, so I used the values from its original copy to fill in the missing ones.
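For reproducibility, a scan like the following can locate such malformed lines before fixing them by hand (a sketch; the path and the assumption that the stray header text appears verbatim inside a data line are mine):
# Sketch: report any line past the first that contains the header text,
# i.e. a header repeated among or after the values of a row.
with open('cycle_share/trip.csv') as f:
    header = f.readline().strip()
    for lineno, line in enumerate(f, start=2):
        if header in line:
            print(lineno, line[:80])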
In [1]:
import pandas as pd
import numpy as np
sets = ['station', 'trip', 'weather']
cycle = {}
# Load the three CSVs into a dict keyed by dataset name.
for s in sets:
    cycle[s] = pd.read_csv('cycle_share/' + s + '.csv')
In [2]:
cycle['trip'].head()
Out[2]:
In [3]:
# Print summary statistics for each of the three datasets.
for name in cycle:
    print(name)
    print(cycle[name].describe(include='all'))
    print('\n')
Gender and year of birth have nulls. I don't think we should drop those rows, since we would lose over 100,000 of them; instead we could replace the missing years of birth with the median or mean. For gender no sensible replacement exists, but it is worth noting that most of the entries are male.
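A minimal sketch of the median imputation suggested above (assuming the trip file's year-of-birth column is named 'birthyear'):
# Replace missing years of birth with the median; the column name is an assumption.
median_year = cycle['trip']['birthyear'].median()
cycle['trip']['birthyear'] = cycle['trip']['birthyear'].fillna(median_year)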
In [5]:
cycle['trip'].groupby('gender')['trip_id'].count()
Out[5]:
In [39]:
# Extract the date portion of 'starttime' (everything before the first space)
# as a string, so trips can be joined to the weather table's 'Date' column.
cycle['trip']['date'] = cycle['trip']['starttime'].apply(lambda x: x[0:x.find(' ')])
cycle['trip'].head()
Out[39]:
In [40]:
# Left-join each trip onto that day's weather record.
trip_weather = pd.merge(cycle['trip'], cycle['weather'], left_on='date', right_on='Date', how='left')
trip_weather.head()
Out[40]:
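Since this is a left join, a trip whose date has no counterpart in the weather table survives with NaN in the weather columns; a quick count shows whether that happened:
# Trips that found no matching weather row (0 means every date matched).
print(trip_weather['Date'].isna().sum())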
In [68]:
print(len(trip_weather))
trip_weather = trip_weather.drop_duplicates()
print(len(trip_weather))
In [72]:
print(len(trip_weather['trip_id']))
print(len(trip_weather['trip_id'].unique()))
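If those two counts differ, some trips matched more than one weather row, which would point to duplicated dates in the weather table; a check along these lines would confirm:
# Dates that appear more than once in the weather table.
date_counts = cycle['weather']['Date'].value_counts()
print(date_counts[date_counts > 1])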
In [80]:
# Attach station coordinates twice: once for the origin station and once for
# the destination; the suffixes disambiguate the overlapping 'lat'/'long' columns.
trip_weather = pd.merge(trip_weather, cycle['station'][['station_id', 'lat', 'long']], left_on='from_station_id', right_on='station_id', how='left').drop('station_id', axis=1)
trip_weather = pd.merge(trip_weather, cycle['station'][['station_id', 'lat', 'long']], left_on='to_station_id', right_on='station_id', how='left', suffixes=['_from_station', '_to_station']).drop('station_id', axis=1)
trip_weather.head()
Out[80]:
In [129]:
# Round each trip duration to the nearest half unit (x.0 or x.5).
def round_trips(duration):
    roundings = np.array([np.floor(duration), np.floor(duration) + 0.5, np.ceil(duration)])
    return roundings[np.argmin(np.abs(duration - roundings))]

trip_weather['tripduration'] = trip_weather['tripduration'].apply(round_trips)
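For example, under this rule 4.1 snaps down to 4.0 while 4.3 snaps to 4.5, the nearest of 4.0, 4.5, and 5.0:
print(round_trips(4.1))   # 4.0
print(round_trips(4.3))   # 4.5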
In [130]:
trip_weather['tripduration'].head(10)
Out[130]:
In [140]:
# Sanity check: trips whose recorded duration differs from the stop - start
# interval by more than 60 seconds.
trip_weather[np.abs(((pd.to_datetime(trip_weather['stoptime']) - pd.to_datetime(trip_weather['starttime'])) / np.timedelta64(1, 's')) - trip_weather['tripduration']) > 60]
Out[140]:
In [154]:
# 'Max_Gust_Speed_MPH' is not numeric because missing values are recorded as '-';
# replace them with NaN and convert the column to float.
trip_weather['Max_Gust_Speed_MPH'] = trip_weather['Max_Gust_Speed_MPH'].replace('-', np.nan).astype('float')
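An equivalent one-liner is pd.to_numeric with errors='coerce', which turns any unparseable token (not just '-') into NaN:
# Same result: coerce every non-numeric value to NaN.
trip_weather['Max_Gust_Speed_MPH'] = pd.to_numeric(trip_weather['Max_Gust_Speed_MPH'], errors='coerce')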
In [155]:
trip_weather['Max_Gust_Speed_MPH'].describe()
Out[155]:
In [156]:
trip_weather.to_csv('cycle_share/trip_weather.csv')
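One caveat: by default to_csv also writes the DataFrame index as an unnamed first column, which would reappear on re-import; passing index=False avoids that:
# Write without the row index to avoid a stray unnamed column on re-import.
trip_weather.to_csv('cycle_share/trip_weather.csv', index=False)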
This dataset contains 28 attributes for various movie titles scraped from IMDb. Each record is supposed to be a unique title, with the following attributes:
"movie_title", "color", "num_critic_for_reviews", "movie_facebook_likes", "duration", "director_name", "director_facebook_likes", "actor_3_name", "actor_3_facebook_likes", "actor_2_name", "actor_2_facebook_likes", "actor_1_name", "actor_1_facebook_likes", "gross", "genres", "num_voted_users", "cast_total_facebook_likes", "facenumber_in_poster", "plot_keywords", "movie_imdb_link", "num_user_for_reviews", "language", "country", "content_rating", "budget", "title_year", "imdb_score", "aspect_ratio"
The original set is available on Kaggle.
In [157]:
movies = pd.read_csv('movies/movies_data.csv')
movies.head()
Out[157]:
In [224]:
movies.dtypes
Out[224]:
In [159]:
movies.describe(include='all')
Out[159]:
In [180]:
print(movies['color'].unique())
# Normalize the lowercase variants of the 'color' flag to the canonical labels.
movies['color'] = movies['color'].apply(lambda x: 'Color' if x == 'color' else 'Black and White' if x == 'black and white' else x)
print(movies['color'].unique())
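The same normalization can be written as a single vectorized replace with a mapping, which is easier to extend if more variants turn up:
# Equivalent vectorized form of the normalization above.
movies['color'] = movies['color'].replace({'color': 'Color', 'black and white': 'Black and White'})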
In [210]:
def find_spaces(df):
    """Return the object columns that contain leading or trailing spaces."""
    cols = []
    for index, value in df.dtypes[df.dtypes == 'object'].iteritems():
        if df[index].str.startswith(' ').any() | df[index].str.endswith(' ').any():
            cols.append(index)
    return cols
find_spaces(movies)
Out[210]:
In [217]:
for col in find_spaces(movies):
    movies[col] = movies[col].str.lstrip().str.rstrip()
find_spaces(movies)
Out[217]:
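As an aside, .str.strip() trims both sides in one call and behaves identically to the lstrip/rstrip chain above:
# Equivalent single-call form of the whitespace cleanup.
for col in find_spaces(movies):
    movies[col] = movies[col].str.strip()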
In [221]:
print(len(movies))
movies = movies.drop_duplicates()
print(len(movies))
In [252]:
# Titles that still appear more than once after exact-duplicate removal.
title_duplicates = list(movies['movie_title'].value_counts()[movies['movie_title'].value_counts() > 1].index)
movies[movies['movie_title'].isin(title_duplicates)].sort_values(by='movie_title')
Out[252]:
In [256]:
# Inspect one suspected duplicate pair in full.
print(movies.loc[337])
print(movies.loc[4584])
In [269]:
# Split the duplicated titles: if a title's most frequent IMDb link occurs more
# than once, the rows are true duplicates of one film; otherwise the title is
# shared by distinct films. Only the top link count matters, hence the break.
true_dup = []
false_dup = []
for title in title_duplicates:
    for index, value in movies[movies['movie_title'] == title]['movie_imdb_link'].value_counts().iteritems():
        if value > 1:
            true_dup.append(title)
        else:
            false_dup.append(title)
        break
print(true_dup)
print(false_dup)
In [299]:
# Disambiguate distinct films that share a title by appending the release year.
movies['movie_title'] = movies.apply(lambda x: x['movie_title'] + ' (' + str(int(x['title_year'])) + ')' if str(x['title_year']) != 'nan' and x['movie_title'] in false_dup else x['movie_title'], axis=1)
In [303]:
print(len(movies))
movies = movies.drop_duplicates('movie_title')
print(len(movies))
In [322]:
# Count each actor's appearances across all three actor slots.
actors = movies.groupby(['actor_1_name'])['movie_title'].count()
actors = actors.add(movies.groupby(['actor_2_name'])['movie_title'].count(), fill_value=0)
actors = actors.add(movies.groupby(['actor_3_name'])['movie_title'].count(), fill_value=0)
(actors / len(movies)).sort_values(ascending=False).head(20)
Out[322]:
In [353]:
# Per-slot first/last credited years; adding with fill_value=0 merely unions the
# disjoint per-slot columns so the row-wise min/max below can combine them.
actor_years = movies.groupby(['actor_1_name'])['title_year'].aggregate({'min_year_1': np.min, 'max_year_1': np.max})
actor_years = actor_years.add(movies.groupby(['actor_2_name'])['title_year'].aggregate({'min_year_2': np.min, 'max_year_2': np.max}), fill_value=0)
actor_years = actor_years.add(movies.groupby(['actor_3_name'])['title_year'].aggregate({'min_year_3': np.min, 'max_year_3': np.max}), fill_value=0)
actor_years['first_year'] = np.min(actor_years[['min_year_1', 'min_year_2', 'min_year_3']], axis=1)
actor_years['last_year'] = np.max(actor_years[['max_year_1', 'max_year_2', 'max_year_3']], axis=1)
actor_years.drop(['min_year_1', 'min_year_2', 'min_year_3', 'max_year_1', 'max_year_2', 'max_year_3'], axis=1, inplace=True)
actor_years['history_length'] = actor_years['last_year'] - actor_years['first_year']
actor_years['movie_number'] = actors
actor_years['movie_proportion'] = actors / len(movies)
actor_years
Out[353]:
In [372]:
# Dense-rank films by gross, highest first; films with missing gross rank last.
movies['gross_sales_rank'] = movies['gross'].rank(method='dense', ascending=False, na_option='bottom')
movies[['movie_title', 'gross', 'gross_sales_rank']].sort_values(by='gross_sales_rank').head(20)
Out[372]: